from uuid import uuid4
from pathlib import Path
from typing import Optional, Union, Callable, List

from pydantic import BaseModel, ConfigDict, field_validator, Field

from phi.agent import Agent, RunResponse
from phi.utils.log import logger, set_log_level_to_debug
from phi.utils.timer import Timer


# Structured output returned by the accuracy-evaluator Agent (used as its
# response_model). No class docstring on purpose: pydantic would expose it as
# the model's JSON-schema description, which could alter the schema the
# evaluator sees — TODO confirm before adding one.
class AccuracyResult(BaseModel):
    # Whole-number score from 1 (worst) to 10 (best); description is part of the schema shown to the LLM.
    score: int = Field(..., description="Accuracy Score between 1 and 10 assigned to the Agent's answer.")
    # Free-text justification for the score, produced by the evaluator.
    reason: str = Field(..., description="Detailed reasoning for the accuracy score.")


# Final result of an evaluation run, derived from AccuracyResult in Eval.run()
# (score/reason are copied over). Kept as a separate model so it can be
# serialized to disk via model_dump_json.
class EvalResult(BaseModel):
    # Copied from AccuracyResult.score.
    accuracy_score: int = Field(..., description="Accuracy Score between 1 to 10.")
    # Copied from AccuracyResult.reason.
    accuracy_reason: str = Field(..., description="Reasoning for the accuracy score.")


class Eval(BaseModel):
    """Evaluate the accuracy of an answer to a question against an expected answer.

    The answer may be supplied directly (string or callable), stored on the
    instance, or produced by `agent`. An LLM-backed evaluator Agent then scores
    it from 1 to 10 and the result is optionally saved to a file.
    """

    # Evaluation name
    name: Optional[str] = None
    # Evaluation UUID (autogenerated if not set)
    eval_id: Optional[str] = Field(None, validate_default=True)
    # Agent to evaluate
    agent: Optional[Agent] = None

    # Question to evaluate
    question: str
    # Answer that was evaluated; populated by run() when produced there
    answer: Optional[str] = None
    # Expected Answer for the question
    expected_answer: str
    # Result of the evaluation
    result: Optional[EvalResult] = None

    # Evaluator Agent; a default OpenAI-backed one is built when unset
    accuracy_evaluator: Optional[Agent] = None
    # Guidelines for the accuracy evaluator
    accuracy_guidelines: Optional[List[str]] = None
    # Additional context to the accuracy evaluator
    accuracy_context: Optional[str] = None
    # Raw structured output from the accuracy evaluator
    accuracy_result: Optional[AccuracyResult] = None

    # Save the result to a file; may contain {name} and {eval_id} placeholders
    save_result_to_file: Optional[str] = None

    # debug_mode=True enables debug logs
    debug_mode: bool = False

    model_config = ConfigDict(arbitrary_types_allowed=True)

    @field_validator("eval_id", mode="before")
    def set_eval_id(cls, v: Optional[str] = None) -> str:
        """Autogenerate a UUID for eval_id when none was provided."""
        return v or str(uuid4())

    @field_validator("debug_mode", mode="before")
    def set_log_level(cls, v: bool) -> bool:
        """Side effect: switch the global logger to debug level when enabled."""
        if v:
            set_log_level_to_debug()
            logger.debug("Debug logs enabled")
        return v

    def get_accuracy_evaluator(self) -> Agent:
        """Return the evaluator Agent, building the default OpenAI one if needed.

        The default evaluator embeds the question, expected answer, optional
        guidelines, and optional context into its system description and
        returns an AccuracyResult via response_model.
        """
        if self.accuracy_evaluator is not None:
            return self.accuracy_evaluator

        try:
            from phi.model.openai import OpenAIChat
        except ImportError as e:
            logger.exception(e)
            logger.error(
                "phidata uses `openai` as the default model provider. Please run `pip install openai` to use the default evaluator."
            )
            # NOTE(review): exiting the process (rather than raising) is kept
            # for backward compatibility with existing CLI-style callers.
            exit(1)

        accuracy_guidelines = ""
        if self.accuracy_guidelines is not None and len(self.accuracy_guidelines) > 0:
            accuracy_guidelines = "\n## Guidelines for the AI Agent's answer:\n"
            # Fix: bullet EVERY guideline. The previous "\n- ".join(...) only
            # inserted dashes as separators, leaving the first guideline unbulleted.
            accuracy_guidelines += "\n".join(f"- {guideline}" for guideline in self.accuracy_guidelines)
            accuracy_guidelines += "\n"

        accuracy_context = ""
        if self.accuracy_context is not None and len(self.accuracy_context) > 0:
            accuracy_context = "## Additional Context:\n"
            accuracy_context += self.accuracy_context
            accuracy_context += "\n"

        return Agent(
            model=OpenAIChat(id="gpt-4o-mini"),
            description=f"""\
You are an expert evaluator tasked with assessing the accuracy of an AI Agent's answer compared to an expected answer for a given question.
Your task is to provide a detailed analysis and assign a score on a scale of 1 to 10, where 10 indicates a perfect match to the expected answer.

## Question:
{self.question}

## Expected Answer:
{self.expected_answer}

## Evaluation Criteria:
1. Accuracy of information
2. Completeness of the answer
3. Relevance to the question
4. Use of key concepts and ideas
5. Overall structure and clarity of presentation
{accuracy_guidelines}{accuracy_context}
## Instructions:
1. Carefully compare the AI Agent's answer to the expected answer.
2. Provide a detailed analysis, highlighting:
   - Specific similarities and differences
   - Key points included or missed
   - Any inaccuracies or misconceptions
3. Explicitly reference the evaluation criteria and any provided guidelines in your reasoning.
4. Assign a score from 1 to 10 (use only whole numbers) based on the following scale:
   1-2: Completely incorrect or irrelevant
   3-4: Major inaccuracies or missing crucial information
   5-6: Partially correct, but with significant omissions or errors
   7-8: Mostly accurate and complete, with minor issues
   9-10: Highly accurate and complete, matching the expected answer closely

Your evaluation should be objective, thorough, and well-reasoned. Provide specific examples from both answers to support your assessment.""",
            response_model=AccuracyResult,
        )

    def run(self, answer: Optional[Union[str, Callable]] = None) -> Optional[EvalResult]:
        """Run the evaluation and return the result, or None if scoring failed.

        Args:
            answer: Answer to evaluate — a string, or a zero-argument callable
                returning one. When None, `self.answer` is used if set,
                otherwise the answer is generated by `self.agent`.

        Raises:
            ValueError: If no answer could be obtained from any source.
        """
        logger.debug(f"*********** Evaluation Start: {self.eval_id} ***********")

        answer_to_evaluate: Optional[RunResponse] = None
        if answer is None:
            # Prefer an explicitly stored answer. Fix: previously the agent was
            # ALWAYS run first and its response discarded whenever self.answer
            # was set — a wasted model call; the final result is unchanged.
            if self.answer is not None:
                answer_to_evaluate = RunResponse(content=self.answer)
            elif self.agent is not None:
                logger.debug("Getting answer from agent")
                answer_to_evaluate = self.agent.run(self.question)
        else:
            try:
                if callable(answer):
                    logger.debug("Getting answer from callable")
                    answer_to_evaluate = RunResponse(content=answer())
                else:
                    answer_to_evaluate = RunResponse(content=answer)
            except Exception as e:
                logger.error(f"Failed to get answer: {e}")
                raise

        if answer_to_evaluate is None:
            raise ValueError("No Answer to evaluate.")
        self.answer = answer_to_evaluate.content

        logger.debug("************************ Evaluating ************************")
        logger.debug(f"Question: {self.question}")
        logger.debug(f"Expected Answer: {self.expected_answer}")
        # Fix: log the answer content, not the RunResponse object's repr.
        logger.debug(f"Answer: {answer_to_evaluate.content}")
        logger.debug("************************************************************")

        logger.debug("Evaluating accuracy...")
        accuracy_evaluator = self.get_accuracy_evaluator()
        try:
            # response_model=AccuracyResult means .content is an AccuracyResult.
            self.accuracy_result = accuracy_evaluator.run(answer_to_evaluate.content, stream=False).content
        except Exception as e:
            logger.error(f"Failed to evaluate accuracy: {e}")
            return None

        if self.accuracy_result is not None:
            self.result = EvalResult(
                accuracy_score=self.accuracy_result.score,
                accuracy_reason=self.accuracy_result.reason,
            )

        # -*- Save result to file if save_result_to_file is set
        if self.save_result_to_file is not None and self.result is not None:
            try:
                fn_path = Path(self.save_result_to_file.format(name=self.name, eval_id=self.eval_id))
                # exist_ok=True makes a separate exists() pre-check redundant.
                fn_path.parent.mkdir(parents=True, exist_ok=True)
                fn_path.write_text(self.result.model_dump_json(indent=4))
            except Exception as e:
                # Best-effort: a save failure must not fail the evaluation.
                logger.warning(f"Failed to save result to file: {e}")

        logger.debug(f"*********** Evaluation End: {self.eval_id} ***********")
        return self.result

    def print_result(self, answer: Optional[Union[str, Callable]] = None) -> Optional[EvalResult]:
        """Run the evaluation with a console spinner and print a result table.

        Returns the EvalResult from run(), or None if evaluation failed.
        """
        from phi.cli.console import console
        from rich.table import Table
        from rich.progress import Progress, SpinnerColumn, TextColumn
        from rich.box import ROUNDED

        response_timer = Timer()
        response_timer.start()
        with Progress(SpinnerColumn(spinner_name="dots"), TextColumn("{task.description}"), transient=True) as progress:
            progress.add_task("Working...")
            result: Optional[EvalResult] = self.run(answer=answer)

        response_timer.stop()
        if result is None:
            return None

        table = Table(
            box=ROUNDED,
            border_style="blue",
            show_header=False,
            title="[ Evaluation Result ]",
            title_style="bold sky_blue1",
            title_justify="center",
        )
        table.add_row("Question", self.question)
        table.add_row("Answer", self.answer)
        table.add_row("Expected Answer", self.expected_answer)
        # f-string formats the int directly; the str() wrapper was redundant.
        table.add_row("Accuracy Score", f"{result.accuracy_score}/10")
        table.add_row("Accuracy Reason", result.accuracy_reason)
        table.add_row("Time Taken", f"{response_timer.elapsed:.1f}s")
        console.print(table)

        return result
